These data come from kaggle.com, which hosts a dataset of all recorded names for children born in the United States between 1880 and 2014.
## National baby-name data.
## The raw CSV was originally read with
##   names <- read.csv('Baby-Name-Project/data/raw_data/NationalNames.csv')
## and then saved as an RDS file so the CSV could be removed to save space.
names <- readRDS("Baby-Name-Project/data/raw_data/all-names.rds")
## Baby-name data broken down by state.
## The raw CSV was originally read with
##   state <- read.csv('Baby-Name-Project/data/raw_data/StateNames.csv')
## and likewise saved as an RDS file before the CSV was removed.
state <- readRDS("Baby-Name-Project/data/raw_data/state-names.rds")
## Keep only the rows for boys (Gender 'M') whose name is 'Nathan' or one of
## its variants: 'Nate', 'Nathanial', 'Nathaniel', 'Nathanael'.
## filter() comes from the dplyr package; %in% tests membership in the whole
## set of names at once instead of chaining == comparisons with |.
dnn <- names %>%
  filter(
    Gender == 'M',
    Name %in% c('Nathan', 'Nate', 'Nathanial', 'Nathaniel', 'Nathanael')
  )
## State-level data: keep only the rows for boys named exactly 'Nathan'
## (the name variants are not included here).
state_dnn <- filter(state, Gender == 'M' & Name == 'Nathan')
## Total number of Nathans per state, summed over every row in state_dnn.
## The grouping column is renamed to lowercase `state` in the result, so the
## output has exactly two columns: `state` and `total`.
state_dnn_sum <- state_dnn %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))
## Annotated line chart: yearly count for each Nathan-related name in dnn.
## NOTE(review): ggplot2 3.4.0+ deprecates `size` for line widths in favour of
## `linewidth` (geom_line, element_rect) -- confirm the installed version
## before switching.
p1 <- ggplot(data = dnn, mapping = aes(x = Year, y = Count, color = Name)) +
  geom_line(size = 1) +
  ## shaded band covering 1989-1991, with a '1990' label ending at x = 1988
  annotate('rect', xmin = 1989, xmax = 1991, ymin = 0, ymax = Inf,
           fill = 'cadetblue3', alpha = 0.6) +
  annotate('text', label = '1990', x = 1988, y = 12500, size = 5,
           hjust = 'right') +
  ## one x-axis tick every ten years
  scale_x_continuous(breaks = seq(1880, 2014, by = 10)) +
  labs(
    title = 'Baby boys with Nathan-related names',
    subtitle = 'United States, 1880-2014',
    caption = 'Source: www.kaggle.com',
    y = 'Number of babies'
  ) +
  theme(
    ## overall plot
    plot.margin = unit(rep(1, 4), 'lines'),
    plot.title = element_text(size = 16, face = 'bold'),
    plot.subtitle = element_text(size = 13),
    ## axes: extra space between each axis title and its tick labels
    axis.text = element_text(color = 'black', size = 10),
    axis.title.x = element_text(face = 'bold', size = 12,
                                margin = margin(t = 10, r = 0, b = 0, l = 0)),
    axis.title.y = element_text(face = 'bold', size = 12,
                                margin = margin(t = 0, r = 10, b = 0, l = 0)),
    ## legend drawn inside the panel, in a white box with a black border
    legend.position = c(0.15, 0.7),
    legend.title = element_text(face = 'bold', size = 12),
    legend.text = element_text(size = 12),
    legend.background = element_rect(fill = 'white', color = 'black',
                                     linetype = 'solid', size = 0.5)
  )
p1
## Interactive version of the chart: a line plus a point for every
## year/name combination in dnn.
p2 <- ggplot(data = dnn, mapping = aes(x = Year, y = Count, color = Name)) +
  geom_line() +
  geom_point() +
  labs(y = 'Number of babies') +
  theme(plot.margin = unit(rep(1, 4), 'lines'))
## To make the plot interactive, view it through the ggplotly() function.
ggplotly(p2)
## Hovering the mouse over an individual point should show its count and year.
## If you don't want the graph to show up inside the R Markdown file:
##   - click on the gear icon next to Knit at the top
##   - select 'Chunk Output in Console'
Making a heat map of the total number of Nathans born in each state
## US map with each state filled by the `total` column of state_dnn_sum,
## on a continuous blue (low) to red (high) scale.
## NOTE(review): the subtitle says 1880-2014 -- confirm that the state-level
## data really covers the same years as the national data.
p3 <- plot_usmap(data = state_dnn_sum, values = 'total') +
  labs(
    title = 'Total Number of Nathans By State',
    subtitle = 'US babies born 1880-2014',
    caption = 'Source: Kaggle.com'
  ) +
  ## legend values are formatted with thousands separators
  scale_fill_continuous(name = 'Total', labels = scales::comma,
                        low = 'blue', high = 'red') +
  theme(
    plot.title = element_text(face = 'bold', size = 16),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = 'right',
    legend.title = element_text(face = 'bold', size = 11),
    legend.text = element_text(size = 9)
  )
p3